{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Using TensorFlow backend.\n"
     ]
    }
   ],
   "source": [
    "import matchzoo as mz\n",
    "from matchzoo import DataGenerator\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "import keras\n",
    "from keras.models import Sequential\n",
    "from keras.layers import Dense\n",
    "from sklearn.metrics import accuracy_score\n",
    "from sklearn.metrics import classification_report\n",
    "from sklearn.metrics import f1_score\n",
    "import scipy\n",
    "\n",
    "from sklearn.metrics import precision_recall_fscore_support"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Load Data and Preprocess It\n",
    "This data is based on WikiQA, but formatted differently.  Basically, I took all the matches, and for each match I also generated 5 negative samples.  To see the dataset, see the \"matchzoo experiments\" folder."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Processing text_left with chain_transform of TokenizeUnit => LowercaseUnit => PuncRemovalUnit => StopRemovalUnit => NgramLetterUnit: 100%|█| 716/716 [00:00<00:00, 4180.34it/s]\n",
      "Processing text_right with chain_transform of TokenizeUnit => LowercaseUnit => PuncRemovalUnit => StopRemovalUnit => NgramLetterUnit: 100%|█| 1025/1025 [00:00<00:00, 2940.90it/s]\n",
      "Processing text_left with extend: 100%|██████████████████████████████████████████| 716/716 [00:00<00:00, 179034.32it/s]\n",
      "Processing text_right with extend: 100%|███████████████████████████████████████| 1025/1025 [00:00<00:00, 255628.59it/s]\n",
      "Building VocabularyUnit from a datapack.: 100%|███████████████████████████| 112803/112803 [00:00<00:00, 3134287.32it/s]\n",
      "Processing text_left with chain_transform of TokenizeUnit => LowercaseUnit => PuncRemovalUnit => StopRemovalUnit => NgramLetterUnit => WordHashingUnit: 100%|█| 716/716 [00:00<00:00, 5146.64it/s]\n",
      "Processing text_right with chain_transform of TokenizeUnit => LowercaseUnit => PuncRemovalUnit => StopRemovalUnit => NgramLetterUnit => WordHashingUnit: 100%|█| 1025/1025 [00:00<00:00, 2764.34it/s]\n",
      "Processing text_left with chain_transform of TokenizeUnit => LowercaseUnit => PuncRemovalUnit => StopRemovalUnit => NgramLetterUnit => WordHashingUnit: 100%|█| 200/200 [00:00<00:00, 6248.68it/s]\n",
      "Processing text_right with chain_transform of TokenizeUnit => LowercaseUnit => PuncRemovalUnit => StopRemovalUnit => NgramLetterUnit => WordHashingUnit: 100%|█| 734/734 [00:00<00:00, 2835.54it/s]\n"
     ]
    }
   ],
   "source": [
    "# Read each CSV (its first column is the index) and wrap it in a MatchZoo DataPack.\n",
    "train_raw = mz.pack(pd.read_csv('toy_matchzoo_train.csv', index_col=0))\n",
    "test_raw = mz.pack(pd.read_csv('toy_matchzoo_test.csv', index_col=0))\n",
    "\n",
    "# Cast the labels to float32 before they reach the model.\n",
    "train_raw.relation['label'] = train_raw.relation['label'].astype('float32')\n",
    "test_raw.relation['label'] = test_raw.relation['label'].astype('float32')\n",
    "\n",
    "# Fit the DSSM preprocessor on the training split only, then apply the fitted\n",
    "# transform to both splits so the test split never influences the fit.\n",
    "preprocessor = mz.preprocessors.DSSMPreprocessor()\n",
    "preprocessor.fit(train_raw)\n",
    "\n",
    "train_preprocessed = preprocessor.transform(train_raw)\n",
    "test_preprocessed = preprocessor.transform(test_raw)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Build Model\n",
    "Here we set the Loss Function to Cross Entropy with 5 negative samples.  Am I doing this correctly?  I feel like there may be an issue with the negative samples."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Parameter \"name\" set to DSSM.\n",
      "<matchzoo.losses.rank_cross_entropy_loss.RankCrossEntropyLoss object at 0x000001B68E20FFD0>\n"
     ]
    }
   ],
   "source": [
    "# DSSM: the input shapes come from the fitted preprocessor, the MLP\n",
    "# hyper-parameters are set explicitly.\n",
    "model = mz.models.DSSM()\n",
    "model.params['input_shapes'] = preprocessor.context['input_shapes']\n",
    "model.params['mlp_num_layers'] = 3\n",
    "model.params['mlp_num_units'] = 300\n",
    "model.params['mlp_num_fan_out'] = 128\n",
    "model.params['mlp_activation_func'] = 'relu'\n",
    "\n",
    "# Ranking task trained with rank cross-entropy over 1 positive + 5 negatives.\n",
    "ranking_task = mz.tasks.Ranking(loss=mz.losses.RankCrossEntropyLoss(num_neg=5))\n",
    "model.params['task'] = ranking_task\n",
    "\n",
    "# Fill every parameter that is still unset, then build and compile the model.\n",
    "model.guess_and_fill_missing_params()\n",
    "model.build()\n",
    "model.compile()\n",
    "\n",
    "# Show which loss the task carries; read it through the public `params`\n",
    "# table, like the assignments above, rather than the private `_params`.\n",
    "print(model.params['task'].loss)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Split each preprocessed DataPack into model inputs (x) and labels (y),\n",
    "# training split first, then the test split.\n",
    "(x_train, y_train), (x_test, y_test) = (\n",
    "    pack.unpack() for pack in (train_preprocessed, test_preprocessed)\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Train Model\n",
    "Batch size is divisible by 6 to account for 1 matching example and 5 negative samples.  I also set shuffle to false, so each consecutive set of 6 rows keeps the label pattern 1,0,0,0,0,0."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Train on 4986 samples, validate on 1248 samples\n",
      "Epoch 1/50\n",
      " - 4s - loss: 1.7941 - val_loss: 1.7887\n",
      "Epoch 2/50\n",
      " - 4s - loss: 1.7872 - val_loss: 1.7892\n",
      "Epoch 3/50\n",
      " - 4s - loss: 1.7844 - val_loss: 1.8459\n",
      "Epoch 4/50\n",
      " - 4s - loss: 1.7875 - val_loss: 1.8000\n",
      "Epoch 5/50\n",
      " - 4s - loss: 1.7935 - val_loss: 1.7990\n",
      "Epoch 6/50\n",
      " - 4s - loss: 1.7891 - val_loss: 1.8014\n",
      "Epoch 7/50\n",
      " - 4s - loss: 1.7882 - val_loss: 1.8067\n",
      "Epoch 8/50\n",
      " - 5s - loss: 1.7877 - val_loss: 1.8172\n",
      "Epoch 9/50\n",
      " - 4s - loss: 1.7861 - val_loss: 1.8370\n",
      "Epoch 10/50\n",
      " - 4s - loss: 1.7780 - val_loss: 1.8244\n",
      "Epoch 11/50\n",
      " - 4s - loss: 1.7753 - val_loss: 1.9481\n",
      "Epoch 12/50\n",
      " - 4s - loss: 1.7363 - val_loss: 2.0791\n",
      "Epoch 13/50\n",
      " - 4s - loss: 1.6746 - val_loss: 2.0875\n",
      "Epoch 14/50\n",
      " - 4s - loss: 1.6519 - val_loss: 2.3708\n",
      "Epoch 15/50\n",
      " - 4s - loss: 1.6514 - val_loss: 2.2608\n",
      "Epoch 16/50\n",
      " - 4s - loss: 1.6010 - val_loss: 2.2166\n",
      "Epoch 17/50\n",
      " - 4s - loss: 1.5432 - val_loss: 2.3445\n",
      "Epoch 18/50\n",
      " - 4s - loss: 1.5163 - val_loss: 2.4150\n",
      "Epoch 19/50\n",
      " - 4s - loss: 1.5253 - val_loss: 2.3683\n",
      "Epoch 20/50\n",
      " - 4s - loss: 1.4340 - val_loss: 2.3454\n",
      "Epoch 21/50\n",
      " - 4s - loss: 1.3802 - val_loss: 2.3876\n",
      "Epoch 22/50\n",
      " - 4s - loss: 1.3491 - val_loss: 2.4536\n",
      "Epoch 23/50\n",
      " - 4s - loss: 1.3223 - val_loss: 2.4907\n",
      "Epoch 24/50\n",
      " - 4s - loss: 1.3031 - val_loss: 2.5204\n",
      "Epoch 25/50\n",
      " - 4s - loss: 1.2794 - val_loss: 2.5238\n",
      "Epoch 26/50\n",
      " - 4s - loss: 1.2666 - val_loss: 2.5806\n",
      "Epoch 27/50\n",
      " - 4s - loss: 1.2463 - val_loss: 2.5536\n",
      "Epoch 28/50\n",
      " - 4s - loss: 1.2393 - val_loss: 2.6493\n",
      "Epoch 29/50\n",
      " - 4s - loss: 1.2326 - val_loss: 2.6233\n",
      "Epoch 30/50\n",
      " - 4s - loss: 1.2184 - val_loss: 2.4849\n",
      "Epoch 31/50\n",
      " - 4s - loss: 1.1884 - val_loss: 2.5647\n",
      "Epoch 32/50\n",
      " - 4s - loss: 1.1553 - val_loss: 2.6256\n",
      "Epoch 33/50\n",
      " - 4s - loss: 1.1350 - val_loss: 2.6358\n",
      "Epoch 34/50\n",
      " - 4s - loss: 1.1210 - val_loss: 2.6452\n",
      "Epoch 35/50\n",
      " - 4s - loss: 1.1018 - val_loss: 2.6674\n",
      "Epoch 36/50\n",
      " - 4s - loss: 1.0792 - val_loss: 2.6600\n",
      "Epoch 37/50\n",
      " - 4s - loss: 1.0459 - val_loss: 2.6210\n",
      "Epoch 38/50\n",
      " - 4s - loss: 1.0084 - val_loss: 2.5657\n",
      "Epoch 39/50\n",
      " - 4s - loss: 0.9764 - val_loss: 2.5428\n",
      "Epoch 40/50\n",
      " - 4s - loss: 0.9583 - val_loss: 2.6688\n",
      "Epoch 41/50\n",
      " - 4s - loss: 0.9223 - val_loss: 2.6774\n",
      "Epoch 42/50\n",
      " - 4s - loss: 0.8976 - val_loss: 2.6820\n",
      "Epoch 43/50\n",
      " - 4s - loss: 0.8710 - val_loss: 2.7108\n",
      "Epoch 44/50\n",
      " - 4s - loss: 0.8484 - val_loss: 2.7204\n",
      "Epoch 45/50\n",
      " - 4s - loss: 0.8384 - val_loss: 2.7012\n",
      "Epoch 46/50\n",
      " - 4s - loss: 0.8206 - val_loss: 2.7412\n",
      "Epoch 47/50\n",
      " - 4s - loss: 0.7939 - val_loss: 2.7325\n",
      "Epoch 48/50\n",
      " - 4s - loss: 0.7775 - val_loss: 2.7274\n",
      "Epoch 49/50\n",
      " - 4s - loss: 0.7583 - val_loss: 2.7238\n",
      "Epoch 50/50\n",
      " - 4s - loss: 0.7433 - val_loss: 2.7243\n"
     ]
    }
   ],
   "source": [
    "# RankCrossEntropyLoss(num_neg=5) reads every batch as consecutive groups of\n",
    "# 1 positive followed by 5 negatives.  The batch size therefore has to be a\n",
    "# multiple of the group size, the row count has to split into whole groups,\n",
    "# and the rows must not be shuffled.\n",
    "GROUP_SIZE = 6  # 1 positive + 5 negatives\n",
    "BATCH_SIZE = 20 * GROUP_SIZE\n",
    "\n",
    "assert BATCH_SIZE % GROUP_SIZE == 0, 'batch size must be a multiple of the group size'\n",
    "assert len(y_train) % GROUP_SIZE == 0, 'training rows must come in groups of 6'\n",
    "assert len(y_test) % GROUP_SIZE == 0, 'test rows must come in groups of 6'\n",
    "\n",
    "history = model.fit(\n",
    "    x_train,\n",
    "    y_train,\n",
    "    validation_data=(x_test, y_test),\n",
    "    batch_size=BATCH_SIZE,\n",
    "    epochs=50,\n",
    "    shuffle=False,  # keep each positive next to its 5 negatives\n",
    "    verbose=2,\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Evaluate\n",
    "\n",
    "Here we predict, but since it's cross entropy, the model outputs logits, so we need to apply the sigmoid function.  I'm not sure if this is causing problems.  I think the matchzoo cross entropy uses softmax, which may be causing this problem."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[[0.19028808]\n",
      " [0.12399588]\n",
      " [0.2242804 ]\n",
      " [0.04260809]\n",
      " [0.42978793]\n",
      " [0.28992286]\n",
      " [0.09081242]\n",
      " [0.49973217]\n",
      " [0.12237648]\n",
      " [0.12451662]]\n",
      "[[1.]\n",
      " [0.]\n",
      " [0.]\n",
      " [0.]\n",
      " [0.]\n",
      " [0.]\n",
      " [1.]\n",
      " [0.]\n",
      " [0.]\n",
      " [0.]]\n"
     ]
    }
   ],
   "source": [
    "# Score every test pair and squash the raw scores through the logistic\n",
    "# sigmoid (scipy.special.expit).\n",
    "y_pred = scipy.special.expit(model.predict(x_test))\n",
    "\n",
    "# First ten scores, followed by the matching ten labels.\n",
    "print(y_pred[:10])\n",
    "print(y_test[:10])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "C:\\Users\\patelr9\\AppData\\Local\\Continuum\\anaconda3\\lib\\site-packages\\sklearn\\metrics\\classification.py:1135: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 due to no predicted samples.\n",
      "  'precision', 'predicted', average, warn_for)\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0.0\n",
      "0.0\n",
      "0.0\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "C:\\Users\\patelr9\\AppData\\Local\\Continuum\\anaconda3\\lib\\site-packages\\sklearn\\metrics\\classification.py:1135: UndefinedMetricWarning: F-score is ill-defined and being set to 0.0 due to no predicted samples.\n",
      "  'precision', 'predicted', average, warn_for)\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0.0\n"
     ]
    }
   ],
   "source": [
    "# The model is trained with a rank (softmax) cross-entropy loss, so a score is\n",
    "# only meaningful relative to the other candidates of the same question.  An\n",
    "# absolute 0.5 cut-off can flag no candidate at all, which makes precision,\n",
    "# recall and F1 collapse to 0.  Instead, mark the top-scoring candidate of\n",
    "# each group as the predicted match.\n",
    "GROUP_SIZE = 6  # 1 positive + 5 negatives per question, in consecutive rows\n",
    "\n",
    "assert len(y_pred) % GROUP_SIZE == 0, 'predictions must come in groups of 6'\n",
    "\n",
    "# One row per question, one column per candidate.\n",
    "grouped_scores = y_pred.reshape(-1, GROUP_SIZE)\n",
    "best_in_group = grouped_scores.argmax(axis=1)\n",
    "\n",
    "# Boolean mask with exactly one True per group, reshaped back to a column so\n",
    "# it lines up row-for-row with y_test.\n",
    "y_pred_cutoff = np.zeros(grouped_scores.shape, dtype=bool)\n",
    "y_pred_cutoff[np.arange(len(grouped_scores)), best_in_group] = True\n",
    "y_pred_cutoff = y_pred_cutoff.reshape(-1, 1)\n",
    "\n",
    "# NOTE: if every group holds exactly one positive, precision, recall and F1\n",
    "# all equal the fraction of questions whose match is ranked first.\n",
    "prec, recall, f1beta, support = precision_recall_fscore_support(\n",
    "    y_test, y_pred_cutoff, pos_label=1, average='binary')\n",
    "\n",
    "print(prec)\n",
    "print(recall)\n",
    "print(f1beta)\n",
    "print(f1_score(y_test, y_pred_cutoff, pos_label=1, average='binary'))"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
